Word embeddings


Import dependencies

ls | grep .csv


# %%bash
# pip3 install bokeh

# built-in libs
import email

# processing libs
import pandas as pd

# display libs
from tqdm import tqdm_notebook

Import data

emails_full_df = pd.read_csv('emails.csv', chunksize=10000)
emails_df = next(emails_full_df)

(10000, 2)
file message
0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e...
1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e...
2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e...
3 allen-p/_sent_mail/1000. Message-ID: <13505866.1075863688222.JavaMail.e...
4 allen-p/_sent_mail/1001. Message-ID: <30922949.1075863688243.JavaMail.e...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
file       10000 non-null object
message    10000 non-null object
dtypes: object(2)
memory usage: 156.3+ KB

messages_obj_lst = []
messages_str_lst = []

message_metadata = {}

for i in tqdm_notebook(range(emails_df.shape[0])):
    msg = email.message_from_string(emails_df.message[i])
    for msg_property in msg:
        if msg_property in message_metadata:
            message_metadata[msg_property][i] = msg[msg_property]
            message_metadata[msg_property] = ['N/A'] * emails_df.shape[0]
    payload = msg.get_payload() # decode=True
    messages_str_lst.append(payload) #.encode('utf-8').decode('unicode_escape')
    #except KeyboardInterrupt:
#    break
    #    break

print('messages_obj_lst size: %i' % len(messages_obj_lst))

messages_obj_lst size: 10000
Wall time: 4.53 µs
messages_obj_lst size: 10000

# update dataframe object
# emails_df.rename(columns = {'message':'message_obj'}, inplace = True)
emails_df = emails_df.assign(message_obj = pd.Series(messages_obj_lst).values)
emails_df = emails_df.assign(payload     = pd.Series(messages_str_lst).values)

# print(emails_df.payload.str.contains(r'\\'))
emails_df['payload'] = emails_df.payload.str.replace(r'\n', '')

file message message_obj payload
0 allen-p/_sent_mail/1. Message-ID: <18782981.1075855378110.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Here is our forecast
1 allen-p/_sent_mail/10. Message-ID: <15464986.1075855378456.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Traveling to have a business meeting takes the...
2 allen-p/_sent_mail/100. Message-ID: <24216240.1075855687451.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... test successful. way to go!!!
3 allen-p/_sent_mail/1000. Message-ID: <13505866.1075863688222.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Randy, Can you send me a schedule of the salar...
4 allen-p/_sent_mail/1001. Message-ID: <30922949.1075863688243.JavaMail.e... [Message-ID, Date, From, To, Subject, Mime-Ver... Let's shoot for Tuesday at 11:45.

# del messages_obj_lst
# del messages_str_lst

emails_df.drop('message', axis=1, inplace=True)

corpus_text = '\n'.join(emails_df[:50000]['payload'])
sentences = corpus_text.split('\n')
sentences = [line.lower().split(' ') for line in sentences]

def clean(s):
    return [w.strip(',."!?:;()\'') for w in s]
sentences = [clean(s) for s in sentences if len(s) > 0]

from gensim.models import Word2Vec

model = Word2Vec(sentences, size=100, window=5, min_count=3, workers=4)

vectors = model.wv
# del model

In [15]:

array([-0.270508  , -0.17306764,  1.6283128 ,  0.0789329 ,  0.31106964,
        0.769532  ,  1.2730443 , -0.8092405 ,  0.7060038 ,  0.86828023,
       -2.6277056 , -1.3929644 ,  0.6448156 , -0.7771182 , -1.6537852 ,
       -0.4743401 , -1.1166382 ,  1.1569368 , -1.1398625 ,  0.80520093,
       -1.7167239 , -1.5579057 ,  0.10402635,  3.0918787 , -0.0558991 ,
       -0.43233722, -2.051206  , -0.66570055,  1.5504636 , -0.2648149 ,
        0.11560618, -0.32946193, -0.372461  , -0.781641  ,  1.0626622 ,
       -0.5553393 ,  0.5192849 ,  2.4005246 ,  0.05692073, -2.3076432 ,
       -1.5484774 , -0.67129016,  1.7084714 ,  0.68807465, -0.2931756 ,
        0.6166011 ,  1.0874461 , -0.32894936, -2.7945118 ,  0.0930008 ,
       -0.46457678,  1.2848035 , -1.4603778 ,  0.22172059, -0.99450624,
       -1.0969896 , -2.3467455 ,  0.4534696 ,  0.4488058 , -0.7499471 ,
        1.3231988 ,  1.7316022 ,  0.3932503 ,  0.06664114,  0.47186232,
        2.9767272 , -0.49195403,  2.0907822 ,  1.1899747 ,  1.2920406 ,
        0.4943122 , -1.3712525 ,  0.35063776, -1.9195726 ,  1.0069174 ,
       -2.0902    , -0.33823916, -1.9204639 , -0.7886482 ,  2.2157645 ,
       -0.8052555 ,  0.6665139 , -1.1551962 , -0.8287558 ,  0.05057469,
        2.554974  , -0.62844616,  1.7241517 , -0.09033989,  0.4090363 ,
        0.8304872 , -0.9974313 ,  0.94597244, -0.65530026,  0.4551282 ,
        0.7032987 , -0.13521333, -1.385317  , -0.15468638,  0.7746631 ],

print(vectors.similarity('you', 'your'))
print(vectors.similarity('you', 'internet'))

[('correspondence', 0.7597194910049438),
 ('havestructured', 0.7563828825950623),
 ('throw', 0.7550570964813232),
 ('=with', 0.7486575841903687),
 ('refight', 0.7483857870101929),
 ('thinkof', 0.7468332648277283),
 ('do.>', 0.7420369386672974),
 ('visitors', 0.741276204586029),
 ('nik', 0.7395448684692383),
 ('pleaded', 0.7374611496925354)]

# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count) for term, voc in model.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key=lambda k: -k[2])

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)
# print(ordered_terms)
# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(model.wv.syn0norm[term_indices, :], index=ordered_terms)


0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
0.103211 -0.001504 0.051223 0.082046 -0.015313 -0.031139 0.239088 -0.023072 0.260240 0.007476 ... -0.152058 -0.074253 -0.112685 -0.128899 -0.013398 -0.003880 0.017925 -0.033080 0.026232 0.085833
the 0.113068 0.025233 0.103621 0.153182 0.158703 -0.112839 0.170684 -0.030864 -0.080816 0.092015 ... 0.139859 -0.013526 -0.067141 -0.132761 0.028204 0.078747 0.030681 -0.142466 0.065911 -0.094258
to -0.170276 0.023519 -0.007925 0.020925 0.152509 -0.023285 0.163287 -0.003122 -0.051574 -0.069945 ... 0.040197 0.071331 0.091307 -0.199199 -0.065438 0.018316 -0.014435 -0.107577 0.190768 -0.028655

3 rows × 100 columns

def get_related_terms(token, topn=10):
    look up the topn most similar terms to token
    and print them as a formatted list

    for word, similarity in model.most_similar(positive=[token], topn=topn):
        print (word, round(similarity, 3))

societe 0.8
criminal 0.795
prohibited.neither 0.785
disclosureby> 0.784
representations 0.779
intensify 0.771
>others 0.769
asinformation 0.767
unauthorized 0.767
byothers 0.764
nero 0.861
howthis 0.824
al,i 0.824
beleivehowthis 0.815
rivera 0.813
after-what 0.807
disasterthanksgiving 0.797
mullick 0.795
asti 0.793
$175.00 0.792
bridge 0.751
agreement 0.743
transaction 0.737
partnerwill 0.725
unit 0.717
dead 0.712
maximum 0.709
package 0.707
fee 0.705
bond 0.701

nat 0.731
natural 0.697
swap 0.692
disaster 0.669
strip 0.661
vpenanat 0.654
curve.3 0.647
emit 0.643
year> 0.639
projs 0.636
def word_algebra(add=[], subtract=[], topn=1):
    combine the vectors associated with the words provided
    in add= and subtract=, look up the topn most similar
    terms to the combined vector, and print the result(s)
    answers = model.most_similar(positive=add, negative=subtract, topn=topn)
    for term, similarity in answers:

word_algebra(add=[u'i', u'will'])

word_algebra(add=[u'you', u'will'])

word_algebra(add=[u'i', u'am'])

word_algebra(add=[u'mother', u'fuck'])

from sklearn.manifold import TSNE

tsne_input = word_vectors
tsne_input = tsne_input.head(5000)

0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
0.103211 -0.001504 0.051223 0.082046 -0.015313 -0.031139 0.239088 -0.023072 0.260240 0.007476 ... -0.152058 -0.074253 -0.112685 -0.128899 -0.013398 -0.003880 0.017925 -0.033080 0.026232 0.085833
the 0.113068 0.025233 0.103621 0.153182 0.158703 -0.112839 0.170684 -0.030864 -0.080816 0.092015 ... 0.139859 -0.013526 -0.067141 -0.132761 0.028204 0.078747 0.030681 -0.142466 0.065911 -0.094258

2 rows × 100 columns

tsne = TSNE()
tsne_vectors = tsne.fit_transform(tsne_input.values)

tsne_vectors = pd.DataFrame(tsne_vectors,
                            columns=[u'x_coord', u'y_coord'])


x_coord y_coord
-43.682186 33.196354
the 60.936298 -31.871368
to 55.425861 -30.385191
and 33.788177 10.613073
of 35.482513 7.884809

tsne_vectors[u'word'] = tsne_vectors.index

x_coord y_coord word
-43.682186 33.196354
the 60.936298 -31.871368 the
to 55.425861 -30.385191 to
and 33.788177 10.613073 and
of 35.482513 7.884809 of

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value


Loading BokehJS ...

# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(tsne_vectors)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# engage!

